#########################################################
## @file   : txpath.py
## @desc   : test xpath
## @create : 2020/07/08
## @author : Chengan
## @email  : douboer@gmail.com
#########################################################

from lxml import etree
from collections import defaultdict
import html
import json
import re

# Read the saved page into memory. Bytes that cannot be decoded as UTF-8 are
# dropped (errors='ignore') instead of raising.
with open('./tdouban.data.t', mode='r', encoding='utf8', errors='ignore') as page:
    resp = page.read()

# Parse the markup and collect every <div class="result"> element; each one is
# handled as a single entry by the loop further down.
dom = etree.HTML(resp)
result = dom.xpath("//div[@class='result']")
#print(dom.xpath('//div[@class="result"]/div/text()'))

# Build `bkinfo`, a mapping "<sid>-<running index>" -> dict of book fields,
# from the <div class="result"> elements in `result`, then print it as JSON.
#
# Only entries whose first <h3>/<span> contains '书籍' ("books") are kept.
# Entries without that span, or with an unexpected number of rating spans,
# are skipped.

# Patterns are compiled once, outside the loop, and written as raw strings so
# that regex escapes such as \d and \( are not read as (invalid) string escapes.
re_sid = re.compile(r'^.+sid: (\d+),.+$')
re_star = re.compile(r'class=\"(.+)\"')
re_score = re.compile(r'.+rating_nums\">(.+)<\/')
re_votes = re.compile(r'.+>\((.+)\)<')
re_cast = re.compile(r'.+subject-cast\">(.+)<\/')

idx = 0
bkinfo = defaultdict(dict)
for s in result:
    # The first <span> under the <h3> carries the category label of the entry.
    sp = s.xpath('div/div/h3/span[1]')
    if not sp:
        continue

    # tostring() emits non-ASCII characters as numeric character references;
    # html.unescape() turns them back into text so the label can be compared.
    typ = html.unescape(etree.tostring(sp[0]).decode('utf8'))
    if '书籍' not in typ:
        continue

    # The cover anchor looks roughly like:
    #   <a class="nbg" href="https://www.douban.com/link2/?url=...&type=search&pos=0"
    #      onclick="moreurl(this,{i: '0', query: '...', from: 'dou_search_book', sid: 3845101, qcat: ''})"
    #      target="_blank" title="..."><img src="https://img9.doubanio.com/.../s29357535.jpg"/></a>
    # The href is stored exactly as found (still percent-encoded).
    bookhref = s.xpath('div/a/@href')[0]
    title = s.xpath('div/a/@title')[0]
    # The numeric subject id only appears inside the onclick handler text.
    sid = re_sid.search(str(s.xpath('div/a/@onclick')[0])).group(1)
    imghref = s.xpath('div/a/img/@src')[0]

    # Serialise each rating <span> back to unescaped markup; the fields are
    # then pulled out of that markup with the regexes above.
    spans = [
        html.unescape(etree.tostring(x).decode('utf8')).strip()
        for x in s.xpath('div/div/div/span')
    ]
    if len(spans) == 4:
        # star / score / number of ratings / cast line
        star_html, score_html, votes_html, cast_html = spans
        rating_score = re_score.search(score_html).group(1)
    elif len(spans) == 3:
        # No score span (presumably an entry without enough ratings).
        # NOTE(review): score is the int 0 here but a string otherwise.
        star_html, votes_html, cast_html = spans
        rating_score = 0
    else:
        continue

    star = re_star.search(star_html).group(1)
    rating_number = re_votes.search(votes_html).group(1)
    tt = re_cast.search(cast_html).group(1).split(' / ')

    if len(tt) >= 3:
        # The last two fields are publisher and publishing date; everything
        # before them is treated as the author list.
        *au, publisher, publishing = tt
        author = '/'.join(au)
    else:
        # Too few fields to tell publisher and date apart. Reset them
        # explicitly: otherwise they would be undefined on the first such
        # entry (NameError) or carry over from the previous book.
        author = tt[0]
        publisher = None
        publishing = None

    # Append the running index so that repeated subject ids stay distinct keys.
    sid = '-'.join([sid, str(idx)])
    description = s.xpath('div/p')

    bkinfo[sid] = {
        'bookname': title,
        'link': bookhref,
        'img': imghref,
        'star': star,
        'score': rating_score,
        'ratingnum': rating_number,
        'author': author,
        'publisher': publisher,
        'publishing': publishing,
        'description': description[0].text if description else None,
    }

    idx += 1

# ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX escapes.
print(json.dumps(bkinfo, indent=2, ensure_ascii=False))
#print(bkinfo)
